SARIMA Wind Simulator

Code Author: Saf Flatters

Data Source: Ballarat Wind Observations Dataset

1. Load Required Libraries

# install.packages(c(
#   "fitdistrplus",
#   "tidyverse",
#   "forecast",
#   "ggplot2",
#   "tseries",
#   "lubridate",
#   "weibullness",
#   "TTR",
#   "plotly"
# ))


# Load libraries
library(fitdistrplus)  # For Weibull distribution fitting
## Warning: package 'fitdistrplus' was built under R version 4.4.3
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.4.3
## Loading required package: survival
library(tidyverse)    # Multiple uses
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(forecast)     # For ARIMA modelling
## Warning: package 'forecast' was built under R version 4.4.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(ggplot2)      # For plotting
library(tseries)      # For stationarity tests
## Warning: package 'tseries' was built under R version 4.4.3
library(lubridate)    # For date-time manipulation
library(weibullness)  # Goodness of fit testing of Weibull MLE
## 
##  weibullness Package is installed.
library(TTR)          # Identifying trends with 7-day smoothing
## Warning: package 'TTR' was built under R version 4.4.3
library(stats4)       # testing non standard MLEs
library(plotly)       # interactive ggplots
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:MASS':
## 
##     select
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

2. Load & Explore Data

# Import data
wind <- read_csv("wind-observations_ballarat.csv")
## Rows: 60416 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): location_description, wind_direction_cardinal, point, polar
## dbl  (7): latitude, longitude, average_wind_speed, gust_speed, wind_directio...
## dttm (1): date_time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(wind)
## # A tibble: 6 × 12
##   date_time           location_description latitude longitude average_wind_speed
##   <dttm>              <chr>                   <dbl>     <dbl>              <dbl>
## 1 2024-01-04 20:37:30 Rowing Course           -37.6      144.               6.45
## 2 2024-01-04 21:22:17 Rowing Course           -37.6      144.               5.39
## 3 2024-01-04 23:23:33 Rowing Course           -37.6      144.               3.15
## 4 2024-01-04 23:53:11 Rowing Course           -37.6      144.               2.97
## 5 2024-01-05 02:54:10 Rowing Course           -37.6      144.               2.99
## 6 2024-01-05 06:23:45 Rowing Course           -37.6      144.               7.11
## # ℹ 7 more variables: gust_speed <dbl>, wind_direction <dbl>,
## #   wind_direction_cardinal <chr>, point <chr>, wind_speed_average <dbl>,
## #   wind_speed_gust <dbl>, polar <chr>
str(wind)
## spc_tbl_ [60,416 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ date_time              : POSIXct[1:60416], format: "2024-01-04 20:37:30" "2024-01-04 21:22:17" ...
##  $ location_description   : chr [1:60416] "Rowing Course" "Rowing Course" "Rowing Course" "Rowing Course" ...
##  $ latitude               : num [1:60416] -37.6 -37.6 -37.6 -37.6 -37.6 ...
##  $ longitude              : num [1:60416] 144 144 144 144 144 ...
##  $ average_wind_speed     : num [1:60416] 6.45 5.39 3.15 2.97 2.99 7.11 4.65 4.12 4.63 2.2 ...
##  $ gust_speed             : num [1:60416] 10.27 7.6 4.74 4.86 5.07 ...
##  $ wind_direction         : num [1:60416] 107 99.9 81.2 74.1 152.8 ...
##  $ wind_direction_cardinal: chr [1:60416] "ESE" "E" "E" "ENE" ...
##  $ point                  : chr [1:60416] "-37.554098, 143.824685" "-37.554098, 143.824685" "-37.554098, 143.824685" "-37.554098, 143.824685" ...
##  $ wind_speed_average     : num [1:60416] 23.2 19.4 11.3 10.7 10.8 ...
##  $ wind_speed_gust        : num [1:60416] 37 27.4 17.1 17.5 18.3 ...
##  $ polar                  : chr [1:60416] "06 ESE" "05 E" "05 E" "04 ENE" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   date_time = col_datetime(format = ""),
##   ..   location_description = col_character(),
##   ..   latitude = col_double(),
##   ..   longitude = col_double(),
##   ..   average_wind_speed = col_double(),
##   ..   gust_speed = col_double(),
##   ..   wind_direction = col_double(),
##   ..   wind_direction_cardinal = col_character(),
##   ..   point = col_character(),
##   ..   wind_speed_average = col_double(),
##   ..   wind_speed_gust = col_double(),
##   ..   polar = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Plot raw data

# Plot raw wind speed over time

# Sort chronologically so the line plot connects points in time order
wind <- wind[order(wind$date_time), ]

with(wind, plot(
  date_time, average_wind_speed,
  type = "l",
  xlab = "Date/Time",
  ylab = "Wind Speed (m/s)",
  main = "Wind Speed Over Time - Raw Data"
))

Filter and Inspect Observational Density

I examined the observational density to understand how consistently wind speed measurements were recorded over time as it appeared to be inconsistent. This helped me plan how to aggregate the data for modelling.

# Convert date_time column to proper datetime format and filter to one year of data.
# NOTE(review): `date_time` is already POSIXct (read_csv parsed it as <dttm>),
# so re-parsing with ymd_hms() is redundant and is what produces the
# "1 failed to parse" warning below — that row becomes NA and is then silently
# dropped by the filter. Consider removing the mutate() step.
wind <- wind %>%
  mutate(date_time = ymd_hms(date_time)) %>%
  filter(date_time >= as.POSIXct("2023-04-01") & date_time < as.POSIXct("2024-04-01"))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `date_time = ymd_hms(date_time)`.
## Caused by warning:
## !  1 failed to parse.
# Count observations in each clock hour of the record
observations_per_hour <- wind %>%
  mutate(hourly_time = floor_date(date_time, "hour")) %>%
  count(hourly_time, name = "obs_count")

# Histogram of how many hours had 1, 2, 3, ... observations
ggplot(observations_per_hour,
       aes(x = obs_count)) +
  geom_bar(fill = "skyblue",
           color = "black") +
  labs(title = "Frequency of Observations per Hour",
       x = "Observations per Hour",
       y = "Number of Hours") +
  theme_minimal()

3. Preprocess Data

Clean, Filter, Aggregate

Cleaned data: removed not-required columns, stripped date_time column to convert entries into POSIXct objects.

Filtered data: removed data from outside the target period of April 2023 to March 2024 to have a consistent one-year time window for analysis.

Aggregated data: averaged wind speed measurements by hour using the floor of each timestamp to create a regular, evenly spaced time series suitable for ARIMA modelling

# Put the data in date/time order
wind <- wind[order(wind$date_time), ]

# Remove columns not needed for modelling
wind$location_description <- NULL
wind$latitude <- NULL
wind$longitude <- NULL

# Convert date_time
wind$date_time <- sub("\\+00:00$", "", wind$date_time) # Remove the "+00:00" timezone suffix
wind$date_time <- ymd_hms(wind$date_time) # Convert to POSIXct datetime objects

# Rename the target column
names(wind)[names(wind) == "average_wind_speed"] <- "wind_speed"

# Keep only date_time and wind_speed.
# Fixed: select by NAME rather than position (`wind[, 1:2]`) so this step
# does not silently grab the wrong columns if the upstream column order changes.
wind <- wind[, c("date_time", "wind_speed")]

# Filter to April 2023 - April 2024
wind <- wind %>%
  mutate(date_time = ymd_hms(date_time)) %>%
  filter(date_time >= as.POSIXct("2023-04-01") & date_time < as.POSIXct("2024-04-01"))

# Aggregate the data to hourly averages so the series is regular / evenly spaced
wind_hourly <- wind %>%
  mutate(hourly_time = floor_date(date_time, "hour")) %>%  # Round down each timestamp to the hour
  group_by(hourly_time) %>%    # Group by the hourly time
  summarise(
    wind_speed = mean(wind_speed, na.rm = TRUE)  # Mean wind speed for each hour
  ) %>%
  ungroup()

# Check for any missing values in hourly_time column
cat("Missing Values: ")
## Missing Values:
sum(is.na(wind_hourly$hourly_time))
## [1] 0
wind_speed_data <- wind_hourly
head(wind_speed_data, 24)
## # A tibble: 24 × 2
##    hourly_time         wind_speed
##    <dttm>                   <dbl>
##  1 2023-03-31 16:00:00       1.96
##  2 2023-03-31 17:00:00       2.71
##  3 2023-03-31 18:00:00       3.17
##  4 2023-03-31 19:00:00       2.9 
##  5 2023-03-31 20:00:00       3.46
##  6 2023-03-31 21:00:00       3.68
##  7 2023-03-31 22:00:00       3.62
##  8 2023-03-31 23:00:00       4.82
##  9 2023-04-01 00:00:00       6.40
## 10 2023-04-01 01:00:00       5.95
## # ℹ 14 more rows

Explore for patterns

Plot whole timeseries to look for seasonality, trends, anomalies

Plot 7 day smoothed timeseries to look for longer-term trends

# Plot whole timeseries to look for seasonality, trends, anomalies

# Build a ts object from the hourly means.
# NOTE(review): no `frequency` is given, so `start = c(2023, 16)` is read
# against the default frequency of 1 — confirm this is intended (the SARIMA
# section later rebuilds the series with frequency = 24).
ts_wind <- ts(wind_speed_data$wind_speed,
              start = c(2023, 16))

# Plot the time series
autoplot(ts_wind) +
  ggtitle("Wind Speed Time Series") +
  xlab("Elapsed Time (Hourly)") +   # fixed: label was missing its closing ")"
  ylab("Wind Speed (m/s)")

# Plot 7 day smoothed timeseries to look for longer-term trends

# Rolling mean over a 7-day window (24 hours x 7 days = 168 observations)
wind_speed_data$trend <- SMA(wind_speed_data$wind_speed,
                             n = 24 * 7)

# Plot the smoothed trend. The first 167 hours have no SMA value, which is
# what triggers ggplot's "removed rows" warning.
ggplot(wind_speed_data, aes(x = hourly_time, y = trend)) +
  geom_line(color = "blue") +
  ggtitle("Smoothed Wind Speed Trend (7-day Moving Average)") +
  xlab("Elapsed Time (Hourly)") +
  ylab("Wind Speed (m/s)")   # fixed: label was missing its closing ")"
## Warning: Removed 167 rows containing missing values or values outside the scale range
## (`geom_line()`).

4. Distribution Modelling

Proposed Distribution

A Weibull distribution is fitted to the observed hourly wind speed data using Maximum Likelihood Estimation (MLE).

set.seed(123)  # reproducibility
# wp.test() accepts at most 1000 observations, so draw a random sub-sample
wind_sample <- sample(wind_speed_data$wind_speed, 1000)  # can only have up to 1000 samples for test so randomly sampled from observations

# Weibullness test - null hypothesis: the data follow a Weibull distribution
# (so a LARGE p-value is evidence consistent with a Weibull fit)
wp_test_result <- wp.test(wind_sample)
wp_test_result
## 
##  Weibullness test from the Weibull plot
## 
## data:  wind_sample
## correlation = 0.99727278, p-value = 0.1133085

Fit Weibull on Annual data using MLE

# Fit a Weibull distribution to the wind speed data of the entire year
annual_weibull_fit <- fitdist(wind_speed_data$wind_speed,
                              "weibull",
                              method = "mle")

# Extract the MLE parameters
shape_param <- annual_weibull_fit$estimate["shape"]
scale_param <- annual_weibull_fit$estimate["scale"]

# Plot the fitted Weibull density over the empirical histogram.
# Fixed: `after_stat(density)` and `linewidth` replace `..density..` and
# `size`, both deprecated since ggplot2 3.4.0 (the deprecation warnings
# appeared in this document's own output).
ggplot(wind_speed_data, aes(x = wind_speed)) +
    geom_histogram(aes(y = after_stat(density)),
                     bins = 30,
                     fill = "blue",
                     alpha = 0.5) +
    stat_function(fun = dweibull,
                   args = list(shape = shape_param,
                              scale = scale_param),
                   color = "red",
                   linewidth = 1) +
    ggtitle("Fitted Weibull Distribution for Wind Speed") +
    xlab("Wind Speed (m/s)") +
    ylab("Density")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

annual_weibull_fit
## Fitting of the distribution ' weibull ' by maximum likelihood 
## Parameters:
##          estimate    Std. Error
## shape 2.017811103 0.01674106814
## scale 5.136559196 0.02885195187
# Analyse fit against weibull dist
qqcomp(list(annual_weibull_fit), 
       legendtext = "Weibull")

Fit Weibull on Seasonal data using MLE

# Add a meteorological season column (Southern Hemisphere seasons)
wind_speed_data <- wind_speed_data %>%
  mutate(
    month = month(hourly_time),
    season = case_when(
      month %in% c(12, 1, 2) ~ "Summer",
      month %in% c(3, 4, 5) ~ "Autumn",
      month %in% c(6, 7, 8) ~ "Winter",
      month %in% c(9, 10, 11) ~ "Spring"
    )
  )

# Fit a Weibull to each season ONCE and pull both parameters from the same
# fit object. (The original called fitdist() twice per season — once for
# shape, once for scale — doubling the work and the NaN warnings.)
seasonal_params <- wind_speed_data %>%
  group_by(season) %>%
  summarise(
    fit = list(fitdist(wind_speed, "weibull", method = "mle")$estimate),
    .groups = "drop"
  ) %>%
  mutate(
    shape = map_dbl(fit, "shape"),
    scale = map_dbl(fit, "scale")
  ) %>%
  select(season, shape, scale)
## Warning: There were 16 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `shape = fitdist(wind_speed, "weibull", method =
##   "mle")$estimate["shape"]`.
## ℹ In group 1: `season = "Autumn"`.
## Caused by warning in `dweibull()`:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 15 remaining warnings.
seasonal_params
## # A tibble: 4 × 3
##   season shape scale
##   <chr>  <dbl> <dbl>
## 1 Autumn  1.79  4.38
## 2 Spring  2.18  5.35
## 3 Summer  2.45  5.44
## 4 Winter  1.88  5.34
# Plot the four seasonal PDFs with the fitted (theoretical) PDF in red

# Build a 300-point x grid per season spanning that season's observed range,
# then evaluate the fitted Weibull density at each grid point
pdf_data <- wind_speed_data %>%
  group_by(season) %>%
  summarise(x = list(seq(min(wind_speed), 
                         max(wind_speed), 
                         length.out = 300))) %>%
  unnest(x) %>%
  left_join(seasonal_params, 
            by = "season") %>%
  mutate(pdf = dweibull(x, 
                        shape = shape, 
                        scale = scale))

# Shaded area = empirical kernel density per season; red line = fitted Weibull
ggplot(wind_speed_data, aes(x = wind_speed, fill = season)) +
  geom_density(alpha = 0.4) +
  geom_line(data = pdf_data, 
            aes(x = x, y = pdf), 
            colour = "red", 
            inherit.aes = FALSE) +
  facet_wrap(~season, scales = "free_y") +
  labs(title = "Wind Speed Distributions by Season",
       x = "Wind Speed (m/s)",
       y = "Density") +
  theme_minimal()

# Plot Q-Q plots for each season

# Generate theoretical Weibull quantiles for each row using the season's
# fitted shape/scale.
# NOTE(review): `ppoints(n)` is called on the COLUMN n (a vector), not a
# scalar — this works only because ppoints() uses length(x) when its argument
# has length > 1, and the column's length equals the group size.
qq_data <- wind_speed_data %>%
  left_join(seasonal_params, by = "season") %>%
  group_by(season) %>%
  arrange(wind_speed) %>%
  mutate(
    n = n(),
    p = ppoints(n),  # evenly spaced probabilities for the quantile grid
    weibull_q = qweibull(p, 
                         shape = shape, 
                         scale = scale)
  ) %>%
  ungroup()

# Plot Q-Q plots by season; points along the identity line indicate a good fit
ggplot(qq_data, 
       aes(x = weibull_q, 
                    y = wind_speed, 
                    colour = season)) +
      geom_point(alpha = 0.4) +
      geom_abline(slope = 1, 
                  intercept = 0, 
                  colour = "black") +
      facet_wrap(~season, 
                 scales = "free") +
      labs(title = "Q–Q Plots of Weibull Fits by Season",
       x = "Theoretical Quantiles (Weibull)",
       y = "Empirical Quantiles (Observed Wind Speed)") +
  theme_minimal() + 
  theme(legend.position = "none")  

Compare Annual vs Seasonal Weibull fit

# Store each season's fitted Weibull model as a fit object in a list column
# (kept whole, rather than just the parameters, so logLik/AIC can be
# extracted from them below)
seasonal_fits <- wind_speed_data %>%
  group_by(season) %>%
  summarise(
    fit = list(fitdist(wind_speed, "weibull")), # MLE fit for each season (mle is fitdist's default)
    .groups = "drop"
  )
## Warning: There were 8 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `fit = list(fitdist(wind_speed, "weibull"))`.
## ℹ In group 1: `season = "Autumn"`.
## Caused by warning in `dweibull()`:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 7 remaining warnings.
# Extract the log-likelihood and AIC of the ANNUAL model
loglik_annual <- logLik(annual_weibull_fit)
aic_annual <- AIC(annual_weibull_fit)

# Extract the log-likelihood and AIC of each seasonal fit
seasonal_fits <- seasonal_fits %>%
  mutate(
    loglik = map_dbl(fit, ~ as.numeric(logLik(.))),
    aic = map_dbl(fit, AIC)
  )

# The four per-season fits together form one "Seasonal" model, so their
# log-likelihoods and AICs simply sum
seasonal_row <- seasonal_fits %>%
  summarise(
    total_loglik = sum(loglik),
    total_aic = sum(aic)
  ) %>%
  mutate(model = "Seasonal")

# Annual model as a one-row tibble with matching columns
annual_row <- tibble(
  model = "Annual",
  total_loglik = as.numeric(loglik_annual),
  total_aic = aic_annual
)

# Stack the two rows for side-by-side comparison
comparison <- bind_rows(seasonal_row, annual_row)

comparison
## # A tibble: 2 × 3
##   total_loglik total_aic model   
##          <dbl>     <dbl> <chr>   
## 1      -19016.    38048. Seasonal
## 2      -19215.    38434. Annual

Simulate Wind Speeds using fitted models

# Simulate seasonal wind speeds using the seasonal Weibull parameters
set.seed(123)  # Reproducibility

# Join fitted shape/scale to the original data
wind_seasonal <- wind_speed_data %>%
  left_join(seasonal_params,
            by = "season")

# Simulate one wind speed per row using that row's seasonal parameters.
# rweibull() is vectorised over shape/scale, so a single call of length n()
# replaces the original rowwise() loop of rweibull(1, ...) calls — far faster,
# and it draws one uniform per variate in the same order, so with the same
# seed the simulated values are unchanged.
wind_seasonal <- wind_seasonal %>%
  mutate(wind_speed = rweibull(n(),
                               shape = shape,
                               scale = scale))

# Store simulated values in a separate data frame
simulated_data <- wind_seasonal %>%
  select(season, wind_speed)

# Compare summary statistics
cat("Simulated Data:\n")
## Simulated Data:
summary(simulated_data$wind_speed)
##        Min.     1st Qu.      Median        Mean     3rd Qu.        Max. 
##  0.06178181  2.82084129  4.34837356  4.57010047  6.03621303 15.09884434
cat("\nActual Data:\n")
## 
## Actual Data:
summary(wind_speed_data$wind_speed)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  0.310000  2.780000  4.270000  4.545675  5.948750 15.142500

Validate Simulated data

# Plot simulated vs actual wind speed distributions.
# Fixed: `linewidth` replaces the `size` aesthetic for lines, deprecated
# since ggplot2 3.4.0 (the warning appeared in this document's own output).
ggplot() +
  geom_density(data = wind_speed_data,
               aes(x = wind_speed,
                   color = "Actual Wind Speed"),
                   linewidth = 1) +
  geom_density(data = simulated_data,
               aes(x = wind_speed,
                   color = "Simulated Wind Speed"),
                   linewidth = 1,
                   linetype = "dashed") +
  ggtitle("Comparison: Simulated vs. Actual Wind Speed Distributions") +
  xlab("Wind Speed (m/s)") +
  ylab("Density") +
  scale_color_manual(values = c("Actual Wind Speed" = "blue",
                                "Simulated Wind Speed" = "tomato"))

# Pair the sorted quantiles of the two series (same length: one simulated
# value per observed hour)
qq_df <- data.frame(
  actual = sort(wind_speed_data$wind_speed),
  simulated = sort(simulated_data$wind_speed)
)

# Q-Q plot: points on the identity line indicate matching distributions
ggplot(qq_df, aes(x = actual,
                  y = simulated)) +
  geom_point(alpha = 0.4,
             colour = "purple") +
  geom_abline(slope = 1,
              intercept = 0,
              colour = "black") +
  labs(
    title = "Q–Q Plot: Actual vs Simulated Wind Speed",
    x = "Actual Quantiles",
    y = "Simulated Quantiles"
  ) +
  theme_minimal(base_size = 12)

5. SARIMA

Convert to Time series

# Convert to a time series with daily seasonality (frequency 24 = hourly data)
ts_wind <- ts(wind_speed_data$wind_speed, 
              frequency = 24)
# Plot the autocorrelation out to 200 lags (~8 days) to look for the daily
# seasonal pattern
ggAcf(ts_wind, lag.max = 200) + 
  ggtitle("Autocorrelation Function (ACF) of Hourly Wind Speed")

Test for Stationarity

# Test for stationarity
adf_test <- adf.test(ts_wind)
## Warning in adf.test(ts_wind): p-value smaller than printed p-value
adf_test
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_wind
## Dickey-Fuller = -13.291272, Lag order = 20, p-value = 0.01
## alternative hypothesis: stationary

Model Selection: SARIMA with auto.arima

# find model params with auto.arima 
fit_sarima <- auto.arima(ts_wind, seasonal = TRUE)
summary(fit_sarima)
## Series: ts_wind 
## ARIMA(0,1,1)(0,0,2)[24] 
## 
## Coefficients:
##             ma1       sma1       sma2
##       0.1123117  0.0587036  0.0510640
## s.e.  0.0108745  0.0107938  0.0103651
## 
## sigma^2 = 0.7790499:  log likelihood = -11197.73
## AIC=22403.47   AICc=22403.47   BIC=22431.73
## 
## Training set error measures:
##                           ME         RMSE          MAE          MPE        MAPE
## Training set 0.0001938985014 0.8824340356 0.6463653384 -3.135159909 18.52251643
##                      MASE            ACF1
## Training set 0.2814236465 -0.001578571712
checkresiduals(fit_sarima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)(0,0,2)[24]
## Q* = 209.24384, df = 45, p-value < 2.2204e-16
## 
## Model df: 3.   Total lags used: 48

Model Selection: SARIMA manually

Plot ACF, PACF and Differencing

# ACF and PACF with no differencing.
# Fixed: spell out `lag.max` — the original `lag = 24` relied on R's partial
# argument matching against ggtsdisplay's `lag.max` parameter.
ggtsdisplay(ts_wind, lag.max = 24,
            main = "Raw - no differencing")

# ACF and PACF after first differencing
ggtsdisplay(diff(ts_wind), lag.max = 24,
            main = "First Difference")

Try Multiple Combinations

# Fit a SARIMA model with a given (p,d,q)(P,D,Q)[s] specification, print the
# residual diagnostics, and return the fitted model for AIC comparison.
#
# Args:
#   ts_data:         the time series to model
#   order:           non-seasonal (p, d, q)
#   seasonal_order:  seasonal (P, D, Q)
#   seasonal_period: seasonal period s (default 24 = daily cycle, hourly data)
#
# Returns: the fitted forecast::Arima model object.
#
# NOTE(review): this reuses the name `fit_sarima`, shadowing the auto.arima
# model object assigned to the same name earlier in the script — confirm that
# earlier object is no longer needed by that name.
fit_sarima <- function(ts_data,
                       order = c(0, 0, 0),
                       seasonal_order = c(0, 0, 0),
                       seasonal_period = 24) {
  model <- Arima(ts_data,
                 order = order,
                 seasonal = list(order = seasonal_order,
                                 period = seasonal_period))

  # Residual diagnostics: Ljung-Box test plus residual ACF plot.
  # (A bare `model` line here in the original was a no-op inside a function
  # body — autoprinting only happens at top level — so it was removed.)
  checkresiduals(model)

  return(model)
}
# This code block is used to test multiple combinations of SARIMA and check AIC and residual ACF 
fit_sarima(ts_wind, 
           order = c(2, 1, 2), 
           seasonal_order = c(1, 0, 1))

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2)(1,0,1)[24]
## Q* = 85.210873, df = 42, p-value = 9.131858e-05
## 
## Model df: 6.   Total lags used: 48
## Series: ts_data 
## ARIMA(2,1,2)(1,0,1)[24] 
## 
## Coefficients:
##             ar1         ar2         ma1        ma2       sar1        sma1
##       1.5004928  -0.5492569  -1.4673250  0.4675550  0.6616748  -0.5939269
## s.e.  0.0497728   0.0457704   0.0550776  0.0550052  0.0664114   0.0686531
## 
## sigma^2 = 0.7411406:  log likelihood = -10982.39
## AIC=21978.78   AICc=21978.8   BIC=22028.24

Select Best Model

# best model found from Model Selection sections
bestmodel <- fit_sarima(ts_wind,
                   order = c(2,1,2),
                   seasonal_order = c(1, 0, 1))

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2)(1,0,1)[24]
## Q* = 85.210873, df = 42, p-value = 9.131858e-05
## 
## Model df: 6.   Total lags used: 48
bestmodel
## Series: ts_data 
## ARIMA(2,1,2)(1,0,1)[24] 
## 
## Coefficients:
##             ar1         ar2         ma1        ma2       sar1        sma1
##       1.5004928  -0.5492569  -1.4673250  0.4675550  0.6616748  -0.5939269
## s.e.  0.0497728   0.0457704   0.0550776  0.0550052  0.0664114   0.0686531
## 
## sigma^2 = 0.7411406:  log likelihood = -10982.39
## AIC=21978.78   AICc=21978.8   BIC=22028.24

6. Forecast Wind

# Forecast the next 48 hours (h = 48 hourly steps) with the selected SARIMA
# model, then plot the forecast with its prediction intervals
forecast_wind <- forecast(bestmodel, h = 48)
autoplot(forecast_wind) +
  ggtitle("SARIMA Forecast for Wind Speed") +
  ylab("Wind Speed (m/s)") +
  xlab("Time") +
  theme_minimal()

Interactive Forecast Plot

# Final presentation plot of the forecast

# For better visualisation - zoom in on the last 10 days
# Extract the last 240 observations (10 days)
last_10days <- window(forecast_wind$x, start = tail(time(forecast_wind$x), 240)[1])

# Re-apply the already-fitted model to just the last 10 days of data.
# Fixed: the original passed `forecast_wind$bestmodel`, which is not an
# element of a forecast object (it evaluates to NULL, so a default model was
# silently fitted instead); the fitted model is stored in `forecast_wind$model`.
forecast_limited <- forecast(last_10days, model = forecast_wind$model)

# Define number of points: observed history plus forecast horizon
n_obs <- length(forecast_limited$x)
n_forecast <- length(forecast_limited$mean)
total_points <- n_obs + n_forecast

# X-axis labels: days 10..1 before the forecast (observed), then the two
# forecast days F1 and F2. (The original wrapped these in no-op single-
# argument rep()/paste0() calls, removed here.)
x_labels <- c(as.character(10:1), "F1", "F2")

# Build the plotting data frame: observed values followed by the forecast
# mean, with 80%/95% prediction intervals only over the forecast rows.
# pmax(0, ...) clamps the lower bounds at zero, since wind speed cannot be
# negative even when the interval extends below 0.
df <- data.frame(
  time_index = 1:total_points,
  value = c(as.numeric(forecast_limited$x), as.numeric(forecast_limited$mean)),
  type = c(rep("Observed", n_obs), rep("Forecast", n_forecast)),
  upper_80 = c(rep(NA, n_obs), forecast_limited$upper[, "80%"]),
  lower_80 = c(rep(NA, n_obs), pmax(0, forecast_limited$lower[, "80%"])),
  upper_95 = c(rep(NA, n_obs), forecast_limited$upper[, "95%"]),
  lower_95 = c(rep(NA, n_obs), pmax(0, forecast_limited$lower[, "95%"]))
)

# Long-format ribbon data (one stacked block per interval) so ggplot can map
# the interval name to the fill aesthetic and draw a legend entry for each
ribbon_data <- rbind(
  data.frame(time_index = (n_obs + 1):total_points,
             ymin = df$lower_80[(n_obs + 1):total_points],
             ymax = df$upper_80[(n_obs + 1):total_points],
             Interval = "80% PI"),
  data.frame(time_index = (n_obs + 1):total_points,
             ymin = df$lower_95[(n_obs + 1):total_points],
             ymax = df$upper_95[(n_obs + 1):total_points],
             Interval = "95% PI")
)


# Plot with both 80% and 95% prediction intervals
p <- ggplot() +
  geom_ribbon(data = ribbon_data, 
              aes(x = time_index, ymin = ymin, ymax = ymax, fill = Interval), 
              alpha = 0.3) +
  geom_line(data = df, aes(x = time_index, y = value, colour = type)) +
  scale_fill_manual(values = c("80% PI" = "blue", "95% PI" = "blue")) +
  scale_colour_manual(values = c("Observed" = "#00BFC4", "Forecast" = "#F8766D")) +
  guides(fill = guide_legend(title = NULL),  # Remove fill legend title
         colour = guide_legend(title = NULL)) +  # Remove colour legend title
  scale_x_continuous(
    breaks = seq(12, total_points, by = 24),
    labels = unique(x_labels)
  ) +
  ylab("Wind Speed (m/s)") +
  xlab("Days before Forecast") +
  labs(
    title = "SARIMA 48 Hour Forecast for Wind Speed\nPrevious 10 Days + Forecast Days (F1, F2)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12)
  )

ggplotly(p)   # interactive plot

APPENDIX:

Dealing with complex Seasonality

  • Separate the Seasons: Meteorological Season-specific SARIMA models

  • ARIMAX: ARIMA model with seasonal dummies as a covariate

A1. Separate Meteorological Season-specific SARIMA models

# Filter to a single season only
separate_data <- wind_speed_data %>% filter(season == "Summer")
ts_separate <- ts(separate_data$wind_speed, frequency = 24)

# Title taken from the FILTERED data.
# Fixed: the original used wind_speed_data$season[1] — the first season of
# the full series (which starts in Autumn) — so the title did not match the
# season actually plotted.
autoplot(ts_separate) +
  ggtitle(paste("Wind Speed Time Series -", separate_data$season[1]))

# Fit SARIMA
fit_separate <- auto.arima(ts_separate, seasonal = TRUE)
fit_separate
## Series: ts_separate 
## ARIMA(2,0,1)(0,0,2)[24] with non-zero mean 
## 
## Coefficients:
##             ar1        ar2         ma1       sma1       sma2       mean
##       1.5400995  -0.602653  -0.5402378  0.0663630  0.0356565  4.8115775
## s.e.  0.1014453   0.089210   0.1123488  0.0217245  0.0209964  0.1576854
## 
## sigma^2 = 0.8232304:  log likelihood = -2842.17
## AIC=5698.34   AICc=5698.39   BIC=5738.06
# Check residuals
checkresiduals(fit_separate)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,1)(0,0,2)[24] with non-zero mean
## Q* = 60.5658, df = 43, p-value = 0.0397039
## 
## Model df: 5.   Total lags used: 48

A2. ARIMAX - ARIMA with covariates (Additional Marks)

Further study of the ARIMAX approach used here drew on: https://robjhyndman.com/hyndsight/arimax/

# 1. Encode seasons and make the dummy matrix

# Ensure season is a factor; Summer is listed first so it becomes the
# baseline (reference) level
wind_speed_data$season <- factor(wind_speed_data$season, levels = c("Summer", "Autumn", "Winter", "Spring"))

# Create dummy variables using model.matrix; dropping the intercept column
# ([, -1]) leaves one 0/1 column per non-baseline season and avoids
# multicollinearity with the regression intercept
season_dummies <- model.matrix(~ season, data = wind_speed_data)[, -1]  # drops intercept (Summer is baseline)

# Check the dummy matrix
head(season_dummies)
##   seasonAutumn seasonWinter seasonSpring
## 1            1            0            0
## 2            1            0            0
## 3            1            0            0
## 4            1            0            0
## 5            1            0            0
## 6            1            0            0
# 2. Fit ARIMAX with covariates (exogenous regressors = encoded seasons)
ts_wind <- ts(wind_speed_data$wind_speed, frequency = 24)  # daily cycle on hourly data

# commented out due to excessive computational time. Results ARIMA(2,0,1)(2,0,0)

# Fit SARIMA with seasonal dummy variables as exogenous regressors
# ARIMAX_sea <- auto.arima(ts_wind,
#                   xreg = season_dummies,
#                   seasonal = TRUE,
#                   stepwise = FALSE,
#                   approximation = FALSE)

# Refit the specification auto.arima selected (see commented block above)
# directly — much faster than re-running the full search every knit
ARIMAX_sea <- Arima(ts_wind,
               order = c(2, 0, 1),
               seasonal = list(order = c(2, 0, 0), period = 24),
               xreg = season_dummies)  # exogenous regressors
summary(ARIMAX_sea)
## Series: ts_wind 
## Regression with ARIMA(2,0,1)(2,0,0)[24] errors 
## 
## Coefficients:
##             ar1         ar2         ma1       sar1       sar2  intercept
##       1.3318332  -0.3958826  -0.2762553  0.0625750  0.0514319  4.6655589
## s.e.  0.0812837   0.0749536   0.0860459  0.0107758  0.0107867  0.2226704
##       seasonAutumn  seasonWinter  seasonSpring
##         -0.6015890     0.0393508     0.0836923
## s.e.     0.3053598     0.3117849     0.3040864
## 
## sigma^2 = 0.742242:  log likelihood = -10987.59
## AIC=21995.19   AICc=21995.21   BIC=22065.84
## 
## Training set error measures:
##                            ME         RMSE          MAE          MPE
## Training set -8.563043744e-05 0.8610866141 0.6366005467 -6.269189817
##                     MAPE         MASE           ACF1
## Training set 19.09981615 0.2771721139 0.004120957358
checkresiduals(ARIMAX_sea)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(2,0,1)(2,0,0)[24] errors
## Q* = 87.287061, df = 43, p-value = 7.599244e-05
## 
## Model df: 5.   Total lags used: 48